setwd("/Users/vh3/Documents/MCA/ANALYSIS_3")
#load required packages
require("Matrix")
library(scater, quietly = TRUE)
require("SingleCellExperiment")
options(stringsAsFactors = FALSE)
library(plotly)
molecules <- read.table("allcounts4.csv", header = TRUE, sep = ",", row.names=1, stringsAsFactors = TRUE)
anno <- read.delim("allpheno8.2.csv", header = TRUE, sep = ",")
anno$is_control <- rep("FALSE", length(anno$Name_of_Sample))
anno[which(anno$Name_of_Sample != "SC"), ]$is_control <- "TRUE"
anno[which(anno$Plate=="core"), ]$Sortdate <- "Fall2017"
anno$col <- rep("violet", length(anno$sample_id))
anno[which(anno$ShortenedLifeStage=="bbSpz"), ]$col <- "navy"
anno[which(anno$ShortenedLifeStage=="ooSpz"), ]$col <- "lightskyblue"
anno[which(anno$ShortenedLifeStage2=="Female"), ]$col <- "purple4"
anno[which(anno$ShortenedLifeStage2=="Male"), ]$col <- "purple"
anno[which(anno$ShortenedLifeStage=="sgSpz"), ]$col <- "royalblue"
anno[which(anno$ShortenedLifeStage=="EEF"), ]$col <- "darkorange"
anno[which(anno$ShortenedLifeStage=="ook"), ]$col <- "turquoise4"
anno[which(anno$ShortenedLifeStage=="oocyst"), ]$col <- "steelblue"
anno[which(anno$ShortenedLifeStage=="Ring"), ]$col <- "hotpink"
anno[which(anno$ShortenedLifeStage=="Merozoite"), ]$col <- "lightpink"
anno[which(anno$ShortenedLifeStage2=="Trophozoite"), ]$col <- "violet"
anno[which(anno$ShortenedLifeStage=="ookoo"), ]$col <- "mediumturquoise"
cellnames <- colnames(molecules)
anno <- anno[match(cellnames, anno$sample_id), ]
cols <- c("bbSpz" = "navy", "EEF"="darkorange", "Merozoite"="lightpink", "oocyst"="steelblue", "ook" = "turquoise4", "ooSpz" = "lightskyblue", "Ring"="hotpink", "sgSpz"= "royalblue", "Schizont" = "violet", "Male"="purple", "Female"="purple4", "ookoo" = "mediumturquoise", "Trophozoite"="violet")
knitr::kable(
head(molecules[ , 1:3]), booktabs = TRUE,
caption = 'A table of the first 6 rows and 3 columns of the molecules table.'
)
| oocyst_1 | oocyst_2 | oocyst_3 | |
|---|---|---|---|
| PBANKA_0000101 | 0 | 0 | 0 |
| PBANKA_0000201 | 0 | 0 | 0 |
| PBANKA_0000301 | 0 | 0 | 0 |
| PBANKA_0000401 | 0 | 0 | 0 |
| PBANKA_0000600 | 0 | 0 | 0 |
| PBANKA_0000701 | 0 | 0 | 0 |
knitr::kable(
head(anno[ , 1:3]), booktabs = TRUE,
caption = 'A table of the first 6 rows and 3 columns of the molecules table.'
)
| sample_id | sample_name_for_seq | npgnum |
|---|---|---|
| oocyst_1 | 9A_oocyst2_SC_Pbe_oocyst | 1 |
| oocyst_2 | 2B_oocyst1_SC_Pbe_oocyst | 2 |
| oocyst_3 | 3B_oocyst1_SC_Pbe_oocyst | 3 |
| oocyst_4 | 4B_oocyst1_SC_Pbe_oocyst | 4 |
| oocyst_5 | 5B_oocyst1_SC_Pbe_oocyst | 5 |
| oocyst_6 | 6B_oocyst1_SC_Pbe_oocyst | 6 |
make MCA object
# Set up objects
mca <- SingleCellExperiment(assays = list(
counts = as.matrix(molecules),
logcounts = log2(as.matrix(molecules) + 1)
), colData = anno)
# Set up objects
mca[which(mca$Plate=="core"), ]$Sortdate <- "Fall2017"
mca <- mca[, (colData(mca)$Sortdate != "19.12.2017")] #Removing first rep of bbSPZ because of low genes per cell see Removingbadbbspz_20180424.Rmd
anno2 <- droplevels(subset(anno, Sortdate != "19.12.2017"))
#remove ooSpz
mca <- mca[, (colData(mca)$ShortenedLifeStage != "ooSpz")]
anno2 <- droplevels(subset(anno2, ShortenedLifeStage != "ooSpz"))
#remove ookinetes clusters that are not expressing CTRP
mca <- mca[, (colData(mca)$trueook == "include")]
anno2 <- droplevels(subset(anno2, trueook == "include"))
#Remove asexuals that didn't get classified because they don't pass QC (57 cells)
mca <- mca[, (colData(mca)$ShortenedLifeStage2 != "Shz")]
# define feature names in feature_symbol column
rowData(mca)$feature_symbol <- rownames(mca)
saveRDS(mca, "Pbmca_20180625.rds")
#calculate the quality metrics
mca <- calculateQCMetrics(mca)
#Inspect poor quality cells
tab <- as.data.frame(colData(mca))
ggplot(tab, aes(x=total_features, fill = ShortenedLifeStage2)) + geom_histogram(binwidth = 50) + facet_grid(ShortenedLifeStage2~., scales="free")
ggplot(tab, aes(x=ShortenedLifeStage, total_features)) + geom_violin() + coord_flip() + stat_summary(fun.y = median, geom = "point", size=2, color="red") + theme_minimal()
tapply(tab$total_counts, tab$ShortenedLifeStage2, median)
## bbSpz EEF Female Male Merozoite oocyst
## 69433.5 195956.0 249797.0 212727.0 243566.5 63302.0
## ook ookoo Ring Schizont sgSpz Trophozoite
## 479809.0 289963.5 462611.0 333244.5 60599.5 250863.0
tf <- tapply(tab$total_features, tab$ShortenedLifeStage2, median)
tf
## bbSpz EEF Female Male Merozoite oocyst
## 859.5 2977.0 2527.0 1576.0 202.0 2995.0
## ook ookoo Ring Schizont sgSpz Trophozoite
## 1558.0 3088.0 392.5 938.0 419.0 2225.0
#write.csv(tf, file="rawmediantotalfeatures.csv")
QC based on txnal profile
mcasmall <- mca[, (colData(mca)$ShortenedLifeStage2 == "Merozoite") | (colData(mca)$ShortenedLifeStage2 == "ooSpz") | (colData(mca)$ShortenedLifeStage2 == "Ring") | (colData(mca)$ShortenedLifeStage2 == "sgSpz") ]
mcamedium <- mca[, (colData(mca)$ShortenedLifeStage2 == "bbSpz") | (colData(mca)$ShortenedLifeStage2 == "Schizont") | (colData(mca)$ShortenedLifeStage2 == "Trophozoite")]
mcalarge <- mca[, (colData(mca)$ShortenedLifeStage2 == "EEF") | (colData(mca)$ShortenedLifeStage2 == "Female") | (colData(mca)$ShortenedLifeStage2 == "Male") | (colData(mca)$ShortenedLifeStage2 == "ook") | (colData(mca)$ShortenedLifeStage2 == "ookoo") | (colData(mca)$ShortenedLifeStage2 == "oocyst")]
############MCA SMALL
# Filter cells with low counts
filter_by_total_counts <- (mcasmall$total_counts > 400)
table(filter_by_total_counts)
## filter_by_total_counts
## FALSE TRUE
## 102 666
# Filter cells with low numbers of features detected
filter_by_expr_features <- (mcasmall$total_features > 40)
table(filter_by_expr_features)
## filter_by_expr_features
## FALSE TRUE
## 75 693
# filter out control samples
filter_by_control <- colData(mcasmall)$is_control == FALSE
table(colData(mcasmall)$is_control, colData(mcasmall)$ShortenedLifeStage2)
##
## Merozoite Ring sgSpz
## FALSE 249 282 188
## TRUE 39 6 4
table(filter_by_control)
## filter_by_control
## FALSE TRUE
## 49 719
# Filter data
mcasmall$use <- (
# sufficient features (genes)
filter_by_expr_features &
# sufficient molecules counted
filter_by_total_counts &
# controls shouldn't be used in downstream analysis
filter_by_control
)
table(mcasmall$use)
##
## FALSE TRUE
## 151 617
############MCA MEDIUM
# Filter cells with low counts
filter_by_total_counts <- (mcamedium$total_counts > 2500)
table(filter_by_total_counts)
## filter_by_total_counts
## FALSE TRUE
## 4 421
# Filter cells with low numbers of features detected
filter_by_expr_features <- (mcamedium$total_features > 500)
table(filter_by_expr_features)
## filter_by_expr_features
## FALSE TRUE
## 8 417
# filter out control samples
filter_by_control <- colData(mcamedium)$is_control == FALSE
table(colData(mcamedium)$is_control, colData(mcamedium)$ShortenedLifeStage2)
##
## bbSpz Schizont Trophozoite
## FALSE 110 246 67
## TRUE 2 0 0
table(filter_by_control)
## filter_by_control
## FALSE TRUE
## 2 423
# Filter data
mcamedium$use <- (
# sufficient features (genes)
filter_by_expr_features &
# sufficient molecules counted
filter_by_total_counts &
# controls shouldn't be used in downstream analysis
filter_by_control
)
table(mcamedium$use, mcamedium$ShortenedLifeStage2)
##
## bbSpz Schizont Trophozoite
## FALSE 8 0 0
## TRUE 104 246 67
table(mcasmall$use, mcasmall$ShortenedLifeStage)
##
## Merozoite Ring sgSpz
## FALSE 59 53 39
## TRUE 229 235 153
############MCA LARGE
# Filter cells with low counts
filter_by_total_counts <- (mcalarge$total_counts > 2500)
table(filter_by_total_counts)
## filter_by_total_counts
## FALSE TRUE
## 75 778
# Filter cells with low numbers of features detected
filter_by_expr_features <- (mcalarge$total_features > 1000)
table(filter_by_expr_features)
## filter_by_expr_features
## FALSE TRUE
## 93 760
# filter out control samples
filter_by_control <- colData(mcalarge)$is_control == FALSE
table(colData(mcalarge)$is_control, colData(mcalarge)$ShortenedLifeStage2)
##
## EEF Female Male oocyst ook ookoo
## FALSE 186 129 75 133 129 188
## TRUE 3 0 0 0 6 4
table(filter_by_control)
## filter_by_control
## FALSE TRUE
## 13 840
# Filter data
mcalarge$use <- (
# sufficient features (genes)
filter_by_expr_features &
# sufficient molecules counted
filter_by_total_counts &
# controls shouldn't be used in downstream analysis
filter_by_control
)
table(mcamedium$use, mcamedium$ShortenedLifeStage)
##
## bbSpz Shz
## FALSE 8 0
## TRUE 104 313
table(mcasmall$use, mcasmall$ShortenedLifeStage)
##
## Merozoite Ring sgSpz
## FALSE 59 53 39
## TRUE 229 235 153
table(mcalarge$use, mcalarge$ShortenedLifeStage)
##
## EEF oocyst ook ookoo Shz
## FALSE 30 27 32 8 3
## TRUE 159 106 103 184 201
mca <- cbind(mcasmall, mcamedium)
mca <- cbind(mca, mcalarge)
tab2 <- table(mca$use, mca$ShortenedLifeStage2)
#make QCed SingleCellExperiment
mca.qc.cells <- mca[ , colData(mca)$use]
meds <- tapply(colData(mca.qc.cells)$total_features, colData(mca.qc.cells)$ShortenedLifeStage2, median)
meds
## bbSpz EEF Female Male Merozoite oocyst
## 868.0 3226.0 2528.0 1584.0 200.0 3317.5
## ook ookoo Ring Schizont sgSpz Trophozoite
## 1638.0 3104.0 460.0 938.0 448.0 2225.0
write.csv(meds, file="QCediantotalfeatures2.csv")
table <- table(mca.qc.cells$use, mca.qc.cells$ShortenedLifeStage2)
table
##
## bbSpz EEF Female Male Merozoite oocyst ook ookoo Ring Schizont
## TRUE 104 159 127 74 229 106 103 184 235 246
##
## sgSpz Trophozoite
## TRUE 153 67
info <- as.data.frame(colData(mca.qc.cells))
ggplot(info, aes(x=ShortenedLifeStage2, total_features)) + geom_violin() + coord_flip() + stat_summary(fun.y = median, geom = "point", size=2, color="red") + theme_minimal()
write.csv(tab2, file = "cellcounts2.csv", quote = FALSE)
#Identifying outliers by PCA
plotPCA(mca.qc.cells,
size_by = "total_features",
colour_by = "seqrunnum",
pca_data_input = "colData")
#Looks at the distribution of reads for expression levels of genes
#scater::plotQC(mca, type = "highest-expression")
# Gene filtering (Best to do cells first to get rid of genes only expressed in crappy cells)
# e.g. greater than 1 reads in at least 2 cells
filter_genes <- apply(counts(mca[ , colData(mca)$use]), 1, function(x) length(x[x >= 1]) >= 2)
table(filter_genes)
## filter_genes
## FALSE TRUE
## 89 5156
rowData(mca)$use <- filter_genes
dim(mca[rowData(mca)$use, colData(mca)$use])
## [1] 5156 1787
assay(mca, "logcounts_raw") <- log2(counts(mca) + 1)
reducedDim(mca) <- NULL
#make final QCed SingleCellExperiment
mca.qc <- mca[rowData(mca)$use, colData(mca)$use]
saveRDS(mca.qc, file="mca.qc_20180625.rds")
Normalisation using TMM based on IDC, OOKOO, EEF, SPZ, SEX
mca.qc.ookoo <- mca.qc[, (colData(mca.qc)$ShortenedLifeStage == "ook") | (colData(mca.qc)$ShortenedLifeStage == "ookoo") | (colData(mca.qc)$ShortenedLifeStage == "oocyst")]
mca.qc.eef <- mca.qc[, (colData(mca.qc)$ShortenedLifeStage == "EEF")]
mca.qc.spz <- mca.qc[, (colData(mca.qc)$ShortenedLifeStage == "bbSpz") |
(colData(mca.qc)$ShortenedLifeStage == "sgSpz") |
(colData(mca.qc)$ShortenedLifeStage == "ooSpz")]
mca.qc.idc <- mca.qc[, (colData(mca.qc)$ShortenedLifeStage == "Merozoite") |
(colData(mca.qc)$ShortenedLifeStage == "Ring") |
(colData(mca.qc)$ShortenedLifeStage2 == "Schizont") |
(colData(mca.qc)$ShortenedLifeStage2 == "Trophozoite")]
mca.qc.sex <- mca.qc[, (colData(mca.qc)$ShortenedLifeStage2 == "Male") | (colData(mca.qc)$ShortenedLifeStage2 == "Female")]
mca.qc.ookoo.tmm <- scater::normaliseExprs(mca.qc.ookoo, method = "TMM")
#mca.qc.ooc.tmm <- scater::normaliseExprs(mca.qc.ooc, method = "TMM")
mca.qc.eef.tmm <- scater::normaliseExprs(mca.qc.eef, method = "TMM")
mca.qc.spz.tmm <- scater::normaliseExprs(mca.qc.spz, method = "TMM")
mca.qc.idc.tmm <- scater::normaliseExprs(mca.qc.idc, method = "TMM")
mca.qc.sex.tmm <- scater::normaliseExprs(mca.qc.sex, method = "TMM")
mca.qc.tmm <- cbind(mca.qc.ookoo.tmm, mca.qc.eef.tmm)
#mca.qc.tmm <- cbind(mca.qc.tmm, mca.qc.eef.tmm)
mca.qc.tmm <- cbind(mca.qc.tmm, mca.qc.spz.tmm)
mca.qc.tmm <- cbind(mca.qc.tmm, mca.qc.idc.tmm)
mca.qc.tmm <- cbind(mca.qc.tmm, mca.qc.sex.tmm)
pca <- plotPCA(mca.qc.tmm, colour_by = "ShortenedLifeStage2", exprs_values = "logcounts", ntop = 50)
pcatab <- pca$data
ggplot(pcatab, aes(PC1, PC2)) + geom_point(aes(colour=factor(colour_by))) + scale_color_manual(values = cols) + theme_classic()
saveRDS(mca.qc.tmm, file="MCAqcTMM_20180625.rds")
mca.qc.tmm_counts <- counts(mca.qc.tmm)
mca.qc.tmm_normcounts <- normcounts(mca.qc.tmm)
normlogcounts <- log2(normcounts(mca.qc.tmm) + 1)
qcpheno <- colData(mca.qc.tmm)
write.csv(qcpheno, file = "QC_pheno_20180625.csv", quote = FALSE, row.names = FALSE)
write.csv(mca.qc.tmm_counts, file = "mca.qc.tmm_counts_20180625.csv", quote = FALSE)
write.csv(mca.qc.tmm_normcounts, file = "mca.qc.tmm_norm_counts_20180625.csv", quote = FALSE)
write.csv(normlogcounts, file = "mca.qc.tmm_norm_log_counts_20180625.csv", quote = FALSE)
pca <- plotPCA(mca.qc.tmm, exprs_values = "logcounts", ntop = 500, colour_by = "ShortenedLifeStage2", ncomp=3)
dat <- pca$data
plot_ly(dat,
x = ~PC1,
y = ~PC2,
z = ~PC3,
type = 'scatter3d',
mode = 'markers',
#hoverinfo = "text",
#text = ~paste0(gene_id, " ", gene_name),
color = ~colour_by,
colors = c("bbSpz" = "navy", "EEF"="darkorange", "Merozoite"="lightpink", "oocyst"="steelblue", "ook" = "turquoise4", "ooSpz" = "lightskyblue", "Ring"="hotpink", "sgSpz"= "royalblue", "Schizont" = "violetred", "Male"="purple", "Female"="purple4", "ookoo" = "mediumturquoise", "Trophozoite"="violet"),
marker = list(size = 3)
)
Normalize with TMM on everything together
mca.qc.tmmall <- scater::normaliseExprs(mca.qc, method = "TMM")
pca <- plotPCA(mca.qc.tmmall, colour_by = "ShortenedLifeStage2", exprs_values = "logcounts", ntop = 50)
pcatab <- pca$data
ggplot(pcatab, aes(PC1, PC2)) + geom_point(aes(colour=factor(colour_by))) + scale_color_manual(values = cols) + theme_classic()
saveRDS(mca.qc.tmmall, file="MCAqcTMMall_20180625.rds")
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: OS X El Capitan 10.11.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 plotly_4.7.1
## [3] scater_1.6.3 SingleCellExperiment_1.0.0
## [5] SummarizedExperiment_1.8.1 DelayedArray_0.4.1
## [7] matrixStats_0.53.1 GenomicRanges_1.30.3
## [9] GenomeInfoDb_1.14.0 IRanges_2.12.0
## [11] S4Vectors_0.16.0 ggplot2_2.2.1
## [13] Biobase_2.38.0 BiocGenerics_0.24.0
## [15] Matrix_1.2-14
##
## loaded via a namespace (and not attached):
## [1] viridis_0.5.1 httr_1.3.1 tidyr_0.8.1
## [4] edgeR_3.20.9 jsonlite_1.5 bit64_0.9-7
## [7] viridisLite_0.3.0 shiny_1.1.0 assertthat_0.2.0
## [10] highr_0.6 blob_1.1.1 vipor_0.4.5
## [13] GenomeInfoDbData_1.0.0 yaml_2.1.18 progress_1.1.2
## [16] pillar_1.2.1 RSQLite_2.1.0 backports_1.1.2
## [19] lattice_0.20-35 glue_1.2.0 limma_3.34.9
## [22] digest_0.6.15 promises_1.0.1 XVector_0.18.0
## [25] colorspace_1.3-2 cowplot_0.9.2 htmltools_0.3.6
## [28] httpuv_1.4.3 plyr_1.8.4 XML_3.98-1.11
## [31] pkgconfig_2.0.1 biomaRt_2.34.2 zlibbioc_1.24.0
## [34] purrr_0.2.5 xtable_1.8-2 scales_0.5.0
## [37] later_0.7.2 tibble_1.4.2 lazyeval_0.2.1
## [40] magrittr_1.5 mime_0.5 memoise_1.1.0
## [43] evaluate_0.10.1 beeswarm_0.2.3 shinydashboard_0.7.0
## [46] tools_3.4.4 data.table_1.10.4-3 prettyunits_1.0.2
## [49] stringr_1.3.1 locfit_1.5-9.1 munsell_0.4.3
## [52] AnnotationDbi_1.40.0 compiler_3.4.4 rlang_0.2.0.9001
## [55] rhdf5_2.22.0 grid_3.4.4 RCurl_1.95-4.10
## [58] tximport_1.6.0 htmlwidgets_1.2 rjson_0.2.15
## [61] crosstalk_1.0.0 labeling_0.3 bitops_1.0-6
## [64] rmarkdown_1.9 gtable_0.2.0 DBI_1.0.0
## [67] reshape2_1.4.3 R6_2.2.2 gridExtra_2.3
## [70] knitr_1.20 dplyr_0.7.5 bit_1.1-12
## [73] bindr_0.1.1 rprojroot_1.3-2 ggbeeswarm_0.6.0
## [76] stringi_1.2.2 Rcpp_0.12.17 tidyselect_0.2.4